library(car)
library(ggplot2)
library(reshape2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2

These are not meant to be canonical solutions for this homework question, but rather a sketch of possible approaches for the purpose of discussion in section.

Reading/cleaning data

dat <- read.csv("thatch-ant.dat.txt")
head(dat)
##   Colony Distance Mass Headwidth Headwidth..mm. Size.class
## 1      1        1  109        45          1.895        >43
## 2      1        1  120        43          1.811      40-43
## 3      1        1   94        42          1.768      40-43
## 4      1        1   61        33          1.389      30-34
## 5      1        1   72        41          1.726      40-43
## 6      1        1  134        46          1.937        >43
dat <- dat[which(dat$Colony <= 6), ]
dat <- na.omit(dat)
dim(dat)
## [1] 648   6
n <- dim(dat)[1]

unique(dat$Colony)
## [1] 1 2 3 4 6 5
unique(dat$Distance)
## [1]  1  4  7 10  0
unique(dat$Headwidth)
##  [1] 45 43 42 33 41 46 44 39 40 28 38 37 47 32 29 36 35 34 31 30 48 49 27
unique(dat$Headwidth..mm.)
##  [1] 1.895 1.811 1.768 1.389 1.726 1.937 1.853 1.642 1.684 1.179 1.600
## [12] 1.558 1.979 1.347 1.221 1.516 1.474 1.432 1.305 1.263 2.021 2.063
## [23] 1.137
# plot(dat$Headwidth, dat$Headwidth..mm.)

levels(dat$Size.class)
## [1] "<30"   "\x80"  ">43"   "30-34" "35-39" "40-43"
# re-order Size.class levels
dat$Size.class <- factor(dat$Size.class, levels(dat$Size.class)[c(1, 4, 5, 6, 3, 2)])


dat$Colony <- as.factor(dat$Colony)
dat$Distance <- as.factor(dat$Distance)
dat <- dat[, -5] # remove mm version of headwidth

Exploratory data analysis

p <- ggplot(dat)
p + geom_bar(aes(x=Colony)) + ggtitle("Histogram of colony size")

p + geom_boxplot(aes(x=Colony, y=Mass)) + ggtitle("Boxplots of mass by colony")

p + geom_boxplot(aes(x=Colony, y=Headwidth)) + ggtitle("Boxplots of head width by colony")

# p + geom_bar(aes(x=Colony, fill=Size.class), position="dodge") + ggtitle("Histograms of size class by colony")
dat.size.colony <- melt(dcast(dat, Size.class ~ Colony, fun.aggregate=length), id.vars="Size.class", variable.name="Colony", value.name="Count")
## Using Size.class as value column: use value.var to override.
ggplot(dat.size.colony, aes(x=Colony, y=Count, fill=Size.class)) + geom_bar(stat="identity", position=position_dodge(width=0.8), width=0.7) + geom_text(aes(label=Count), position=position_dodge(width=0.8), vjust=-0.5, size=2.5)

# p + geom_bar(aes(x=Colony, fill=Distance), position="dodge") + ggtitle("Histograms of distance by colony")
dat.distance.colony <- melt(dcast(dat, Distance ~ Colony, fun.aggregate=length), id.vars="Distance", variable.name="Colony", value.name="Count")
## Using Size.class as value column: use value.var to override.
ggplot(dat.distance.colony, aes(x=Colony, y=Count, fill=Distance)) + geom_bar(stat="identity", position=position_dodge(width=0.8), width=0.7) + geom_text(aes(label=Count), position=position_dodge(width=0.8), vjust=-0.5, size=2.5)

#coplot(Mass ~ Distance | Colony, data=dat, rows=1)
dat.tmp <- dat %>% group_by(Colony, Distance) %>% mutate(Mass.mean=mean(Mass)) %>% ungroup
ggplot(dat.tmp, aes(x=as.numeric(as.character(Distance)), y=Mass)) + geom_point(size=0.5) +
  # geom_smooth(method='loess', se=F, method.args=list(degree=1)) +
  geom_line(aes(y=Mass.mean), color='blue') +
  facet_wrap(~Colony, nrow=1) + theme(aspect.ratio=1) +
  ggtitle("Coplot of Mass ~ Distance | Colony") +
  xlab("Distance")

p + geom_point(aes(x=Headwidth, y=Mass), size=0.5) + ggtitle("Mass vs. head width")

# p + geom_point(aes(x=Headwidth, y=Mass)) + facet_wrap(~Colony, nrow=1) + theme(aspect.ratio=1) + ggtitle("Coplot of Mass ~ Headwidth | Colony")

Modeling

When you use lm with one categorical explanatory variable, what dummy variables are used in the model? What happens when you remove the intercept when calling lm? What is the answer to the above questions when there are more than one categorical explanatory variable? Does the order in which you list the explanatory variables matter?

The above sketch considers variations in how you (or lm) choose dummy variables to encode categorical variables. Note that these changes (intercept or no intercept, etc.) will not change the column space of the design matrix, so most things will remain the same (fitted values, residuals, RSS, etc.). However, the coefficients \(\widehat{\beta}\) will be different simply because the dummy variables represent different things depending on how you encode the categorical variables.

mods <- list()
mods <- c(mods, list(lm(Mass ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
# mods <- c(mods, list(lm(sqrt(Mass) ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
mods <- c(mods, list(lm(I(Mass^(0.4)) ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
mods <- c(mods, list(lm(log(Mass) ~ Colony + Distance + Headwidth + Size.class - 1, data=dat)))
mod <- mods[[1]]
print(summary(mod))
## 
## Call:
## lm(formula = Mass ~ Colony + Distance + Headwidth + Size.class - 
##     1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.804  -8.022  -0.555   8.003  51.511 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## Colony1         -98.8873    13.7297  -7.202 1.69e-12 ***
## Colony2         -99.5553    13.5927  -7.324 7.34e-13 ***
## Colony3         -94.7184    13.5871  -6.971 7.92e-12 ***
## Colony4         -99.7989    13.5455  -7.368 5.44e-13 ***
## Colony5         -95.2742    13.5730  -7.019 5.76e-12 ***
## Colony6         -99.2774    13.4595  -7.376 5.14e-13 ***
## Distance1        -6.6332     1.9680  -3.371 0.000796 ***
## Distance4        -9.5713     1.9502  -4.908 1.17e-06 ***
## Distance7        -9.4464     2.0165  -4.685 3.44e-06 ***
## Distance10      -10.5882     2.1188  -4.997 7.54e-07 ***
## Headwidth         5.0542     0.4555  11.097  < 2e-16 ***
## Size.class30-34  -2.2907     4.6612  -0.491 0.623278    
## Size.class35-39  -6.4351     5.8475  -1.100 0.271539    
## Size.class40-43  -4.3846     7.3629  -0.596 0.551722    
## Size.class>43    -0.2594     8.8522  -0.029 0.976627    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.77 on 633 degrees of freedom
## Multiple R-squared:  0.9812, Adjusted R-squared:  0.9808 
## F-statistic:  2207 on 15 and 633 DF,  p-value: < 2.2e-16
fit <- fitted(mod)
rstud <- rstudent(mod)
qplot(fit, rstud, size=I(0.5)) +
  geom_hline(yintercept=0) +
  xlab("Fitted values") +
  ylab("Studentized residuals") +
  geom_smooth(method='loess', se=F, method.args=list(degree=1)) +
  ggtitle("Studentized residuals vs. fitted values")

qplot(log(fit), log(abs(rstud)), size=I(0.5)) +
  geom_smooth(method='lm', se=F) + xlab("log fitted values") + ylab("log abs. stud. res.")

coef(lm(log(abs(rstud)) ~ log(fit)))
## (Intercept)    log(fit) 
##  -3.5125407   0.6031174

for (mod in mods) {
  fitted <- fitted(mod)
  rstud <- rstudent(mod)
  # print(summary(mod))
  # dat.mod <- cbind(dat, Fitted=fitted(mod), Stud.Res=rstudent(mod))
  # print(
  #   qplot(qt((1:n) / (n+1), df.residual(mod)), sort(dat.mod$Stud.Res)) + geom_abline(slope=1)
  #   + xlab("t-distribution quantiles")
  #   + ylab("Sorted studentized residuals")
  #   + ggtitle(sprintf("Q-Q plot for %s model", colnames(mod$model)[1]))
  # )
  print(
    qplot(fitted, rstud, size=I(0.5))
    + geom_hline(yintercept=0)
    + xlab("Fitted values")
    + ylab("Studentized residuals")
    + geom_smooth(method='loess', se=F, method.args=list(degree=1))
    + ggtitle(sprintf("Studentized residuals vs. fitted values\nfor %s model", colnames(mod$model)[1]))
  )
  # print(ggplot(dat.mod, aes(x=log(Fitted), y=log(abs(Stud.Res)))) +
  #         geom_point(size=0.5) + xlab("log Fitted values") + ylab("log Studentized residuals") + geom_smooth(method='lm', se=F))
  # b <- coef(lm(log(abs(Stud.Res))~log(Fitted), data=dat.mod))[2]
  # print(b)
}

for (mod in mods) {
  crPlot(mod, variable="Headwidth",
         main=sprintf("CR plot for head width\nfor %s model", colnames(mod$model)[1]))
}

summary(mods[[3]])
## 
## Call:
## lm(formula = log(Mass) ~ Colony + Distance + Headwidth + Size.class - 
##     1, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.61411 -0.08430  0.00382  0.09391  0.55504 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## Colony1          2.166692   0.144051  15.041  < 2e-16 ***
## Colony2          2.146691   0.142614  15.052  < 2e-16 ***
## Colony3          2.207843   0.142556  15.488  < 2e-16 ***
## Colony4          2.148735   0.142119  15.119  < 2e-16 ***
## Colony5          2.189606   0.142407  15.376  < 2e-16 ***
## Colony6          2.151806   0.141217  15.238  < 2e-16 ***
## Distance1       -0.072842   0.020648  -3.528 0.000450 ***
## Distance4       -0.096494   0.020461  -4.716 2.96e-06 ***
## Distance7       -0.087004   0.021157  -4.112 4.43e-05 ***
## Distance10      -0.100319   0.022231  -4.513 7.63e-06 ***
## Headwidth        0.053252   0.004779  11.143  < 2e-16 ***
## Size.class30-34  0.175366   0.048905   3.586 0.000362 ***
## Size.class35-39  0.250197   0.061352   4.078 5.12e-05 ***
## Size.class40-43  0.285233   0.077251   3.692 0.000241 ***
## Size.class>43    0.296502   0.092877   3.192 0.001481 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1445 on 633 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 4.233e+04 on 15 and 633 DF,  p-value: < 2.2e-16
mods.col <- list()
for (i in 1:length(unique(dat$Colony))) {
  mod.curr <- lm(log(Mass) ~ Distance + Headwidth + Size.class, data=dat, subset=(Colony == i))
  mods.col <- c(mods.col, list(mod.curr))
  print(summary(mod.curr))
  coefs.dist <- coef(mod.curr)[2:5]
  print(
    qplot(
      as.numeric(as.character(levels(dat$Distance))),
      c(0, coefs.dist), 
      geom="line") +
      xlab("Distance") + ylab("Offset from Distance zero") +
      ggtitle(sprintf("Mass differences by distance class,\n Colony %d", i)) +
      ylim(-0.2, 0.2)
  )
}
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat, 
##     subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27804 -0.07940 -0.01233  0.05629  0.41261 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.47044    0.38905   6.350 6.22e-09 ***
## Distance1       -0.13998    0.04812  -2.909 0.004462 ** 
## Distance4       -0.14998    0.04518  -3.320 0.001255 ** 
## Distance7       -0.12899    0.04885  -2.641 0.009588 ** 
## Distance10      -0.17895    0.04863  -3.680 0.000377 ***
## Headwidth        0.04627    0.01310   3.532 0.000624 ***
## Size.class30-34  0.17610    0.10883   1.618 0.108739    
## Size.class35-39  0.30630    0.15334   1.997 0.048463 *  
## Size.class40-43  0.32731    0.19601   1.670 0.098039 .  
## Size.class>43    0.32999    0.23959   1.377 0.171471    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1303 on 101 degrees of freedom
## Multiple R-squared:  0.793,  Adjusted R-squared:  0.7745 
## F-statistic: 42.99 on 9 and 101 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat, 
##     subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62362 -0.07475  0.00107  0.11119  0.26351 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.02183    0.38430   5.261 9.45e-07 ***
## Distance1       -0.02325    0.05272  -0.441  0.66029    
## Distance4       -0.14630    0.05336  -2.742  0.00735 ** 
## Distance7       -0.02323    0.05373  -0.432  0.66656    
## Distance10      -0.06167    0.05317  -1.160  0.24908    
## Headwidth        0.06083    0.01211   5.024 2.51e-06 ***
## Size.class35-39  0.06192    0.08884   0.697  0.48755    
## Size.class40-43  0.05515    0.13351   0.413  0.68052    
## Size.class>43    0.08399    0.17381   0.483  0.63010    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1452 on 91 degrees of freedom
## Multiple R-squared:  0.7985, Adjusted R-squared:  0.7808 
## F-statistic: 45.07 on 8 and 91 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat, 
##     subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32130 -0.07811 -0.00152  0.07140  0.34807 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.390280   0.293065   8.156 1.94e-12 ***
## Distance1       -0.145514   0.045102  -3.226  0.00175 ** 
## Distance4       -0.116551   0.045708  -2.550  0.01247 *  
## Distance7       -0.124728   0.046152  -2.703  0.00823 ** 
## Distance10      -0.126132   0.045695  -2.760  0.00700 ** 
## Headwidth        0.046313   0.009595   4.827 5.64e-06 ***
## Size.class30-34  0.334938   0.102393   3.271  0.00152 ** 
## Size.class35-39  0.334920   0.124359   2.693  0.00844 ** 
## Size.class40-43  0.421685   0.153060   2.755  0.00710 ** 
## Size.class>43    0.464274   0.186393   2.491  0.01458 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.125 on 90 degrees of freedom
## Multiple R-squared:  0.8511, Adjusted R-squared:  0.8362 
## F-statistic: 57.15 on 9 and 90 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat, 
##     subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49421 -0.08658  0.00371  0.08370  0.42056 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.908707   0.419415   4.551 1.42e-05 ***
## Distance1       -0.014610   0.051806  -0.282    0.778    
## Distance4       -0.008673   0.054136  -0.160    0.873    
## Distance7        0.015396   0.054796   0.281    0.779    
## Distance10      -0.050181   0.055344  -0.907    0.367    
## Headwidth        0.062405   0.012720   4.906 3.34e-06 ***
## Size.class35-39  0.075239   0.091273   0.824    0.412    
## Size.class40-43  0.071950   0.129130   0.557    0.579    
## Size.class>43    0.092176   0.171368   0.538    0.592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1464 on 107 degrees of freedom
## Multiple R-squared:  0.7237, Adjusted R-squared:  0.7031 
## F-statistic: 35.04 on 8 and 107 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat, 
##     subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47156 -0.09616  0.01064  0.11597  0.51390 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.20022    0.48538   4.533 1.96e-05 ***
## Distance1       -0.01119    0.06062  -0.185  0.85399    
## Distance4       -0.05887    0.06192  -0.951  0.34453    
## Distance7       -0.16309    0.06245  -2.611  0.01072 *  
## Headwidth        0.04908    0.01620   3.030  0.00327 ** 
## Size.class30-34  0.23009    0.13493   1.705  0.09193 .  
## Size.class35-39  0.38897    0.17548   2.217  0.02942 *  
## Size.class40-43  0.45258    0.24163   1.873  0.06463 .  
## Size.class>43    0.45434    0.29582   1.536  0.12842    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1706 on 82 degrees of freedom
## Multiple R-squared:  0.7758, Adjusted R-squared:  0.7539 
## F-statistic: 35.46 on 8 and 82 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + Size.class, data = dat, 
##     subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42929 -0.06602  0.00576  0.07460  0.44691 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.140719   0.285638   7.495 1.25e-11 ***
## Distance1       -0.101796   0.049021  -2.077   0.0400 *  
## Distance4       -0.108514   0.049340  -2.199   0.0298 *  
## Distance7       -0.095709   0.050832  -1.883   0.0621 .  
## Distance10      -0.114316   0.052651  -2.171   0.0319 *  
## Headwidth        0.055457   0.009934   5.582 1.50e-07 ***
## Size.class30-34  0.096436   0.087659   1.100   0.2735    
## Size.class35-39  0.207890   0.119752   1.736   0.0851 .  
## Size.class40-43  0.223481   0.155390   1.438   0.1530    
## Size.class>43    0.216806   0.190607   1.137   0.2576    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1443 on 120 degrees of freedom
## Multiple R-squared:  0.8498, Adjusted R-squared:  0.8385 
## F-statistic: 75.42 on 9 and 120 DF,  p-value: < 2.2e-16

Alternative models

Transforming head width

mod <- lm(Mass ~ . - 1, data=dat)
crPlot(mod, variable="Headwidth", main="CR plot for head width")

summary(mod)
## 
## Call:
## lm(formula = Mass ~ . - 1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.804  -8.022  -0.555   8.003  51.511 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## Colony1         -98.8873    13.7297  -7.202 1.69e-12 ***
## Colony2         -99.5553    13.5927  -7.324 7.34e-13 ***
## Colony3         -94.7184    13.5871  -6.971 7.92e-12 ***
## Colony4         -99.7989    13.5455  -7.368 5.44e-13 ***
## Colony5         -95.2742    13.5730  -7.019 5.76e-12 ***
## Colony6         -99.2774    13.4595  -7.376 5.14e-13 ***
## Distance1        -6.6332     1.9680  -3.371 0.000796 ***
## Distance4        -9.5713     1.9502  -4.908 1.17e-06 ***
## Distance7        -9.4464     2.0165  -4.685 3.44e-06 ***
## Distance10      -10.5882     2.1188  -4.997 7.54e-07 ***
## Headwidth         5.0542     0.4555  11.097  < 2e-16 ***
## Size.class30-34  -2.2907     4.6612  -0.491 0.623278    
## Size.class35-39  -6.4351     5.8475  -1.100 0.271539    
## Size.class40-43  -4.3846     7.3629  -0.596 0.551722    
## Size.class>43    -0.2594     8.8522  -0.029 0.976627    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.77 on 633 degrees of freedom
## Multiple R-squared:  0.9812, Adjusted R-squared:  0.9808 
## F-statistic:  2207 on 15 and 633 DF,  p-value: < 2.2e-16
mod <- lm(Mass ~ . + I(Headwidth^2) - 1, data=dat)
crPlots(mod, terms=~Headwidth + I(Headwidth^2), main="CR plot for head width")

summary(mod)
## 
## Call:
## lm(formula = Mass ~ . + I(Headwidth^2) - 1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.397  -7.714  -0.633   7.835  50.652 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## Colony1         162.47770   76.03678   2.137  0.03300 *  
## Colony2         161.47260   75.91767   2.127  0.03381 *  
## Colony3         166.41567   75.94658   2.191  0.02880 *  
## Colony4         161.66773   76.03296   2.126  0.03387 *  
## Colony5         165.70549   75.90061   2.183  0.02939 *  
## Colony6         161.76229   75.89766   2.131  0.03345 *  
## Distance1        -6.42364    1.95175  -3.291  0.00105 ** 
## Distance4        -9.43454    1.93351  -4.879 1.35e-06 ***
## Distance7        -8.89666    2.00505  -4.437 1.08e-05 ***
## Distance10      -10.38091    2.10114  -4.941 9.98e-07 ***
## Headwidth        -9.27095    4.12499  -2.248  0.02495 *  
## Size.class30-34  11.74121    6.12200   1.918  0.05558 .  
## Size.class35-39  17.69214    9.01601   1.962  0.05017 .  
## Size.class40-43  20.63467   10.22500   2.018  0.04401 *  
## Size.class>43    20.25507   10.55822   1.918  0.05551 .  
## I(Headwidth^2)    0.17865    0.05113   3.494  0.00051 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.65 on 632 degrees of freedom
## Multiple R-squared:  0.9816, Adjusted R-squared:  0.9811 
## F-statistic:  2106 on 16 and 632 DF,  p-value: < 2.2e-16
mods.col <- list()
for (i in 1:length(unique(dat$Colony))) {
  mod.curr <- lm(log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + Size.class, data=dat, subset=(Colony == i))
  mods.col <- c(mods.col, list(mod.curr))
  print(summary(mod.curr))
  coefs.dist <- coef(mod.curr)[2:5]
  print(
    qplot(
      as.numeric(as.character(levels(dat$Distance))),
      c(0, coefs.dist), 
      geom="line") +
      xlab("Distance") + ylab("Offset from Distance zero") +
      ggtitle(sprintf("Mass differences by distance class,\n Colony %d", i)) +
      ylim(-0.2, 0.2)
  )
}
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27643 -0.07440 -0.01439  0.05836  0.41154 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.198574   3.231483  -0.680 0.497848    
## Distance1       -0.148450   0.048213  -3.079 0.002681 ** 
## Distance4       -0.149359   0.044933  -3.324 0.001241 ** 
## Distance7       -0.129252   0.048581  -2.661 0.009088 ** 
## Distance10      -0.178161   0.048365  -3.684 0.000373 ***
## Headwidth        0.295214   0.171555   1.721 0.088378 .  
## I(Headwidth^2)  -0.002970   0.002041  -1.455 0.148713    
## Size.class30-34 -0.124578   0.233241  -0.534 0.594446    
## Size.class35-39 -0.196618   0.377722  -0.521 0.603841    
## Size.class40-43 -0.215971   0.421139  -0.513 0.609204    
## Size.class>43   -0.172914   0.419748  -0.412 0.681260    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1296 on 100 degrees of freedom
## Multiple R-squared:  0.7973, Adjusted R-squared:  0.777 
## F-statistic: 39.33 on 10 and 100 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62810 -0.07403  0.00608  0.10124  0.26290 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      4.230330   2.087562   2.026   0.0457 *
## Distance1       -0.017700   0.052927  -0.334   0.7388  
## Distance4       -0.140699   0.053563  -2.627   0.0101 *
## Distance7       -0.014969   0.054232  -0.276   0.7832  
## Distance10      -0.060122   0.053140  -1.131   0.2609  
## Headwidth       -0.055050   0.108342  -0.508   0.6126  
## I(Headwidth^2)   0.001453   0.001350   1.076   0.2847  
## Size.class35-39  0.144508   0.117328   1.232   0.2213  
## Size.class40-43  0.146266   0.157990   0.926   0.3570  
## Size.class>43    0.135779   0.180198   0.754   0.4531  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1451 on 90 degrees of freedom
## Multiple R-squared:  0.801,  Adjusted R-squared:  0.7811 
## F-statistic: 40.26 on 9 and 90 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32066 -0.07582 -0.00118  0.07037  0.34189 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      2.0690369  1.6679368   1.240  0.21806   
## Distance1       -0.1444423  0.0456745  -3.162  0.00214 **
## Distance4       -0.1144380  0.0472054  -2.424  0.01736 * 
## Distance7       -0.1254566  0.0465499  -2.695  0.00841 **
## Distance10      -0.1260310  0.0459439  -2.743  0.00736 **
## Headwidth        0.0636913  0.0893347   0.713  0.47774   
## I(Headwidth^2)  -0.0002179  0.0011137  -0.196  0.84531   
## Size.class30-34  0.3233035  0.1188801   2.720  0.00786 **
## Size.class35-39  0.3108879  0.1752603   1.774  0.07951 . 
## Size.class40-43  0.3965094  0.2005823   1.977  0.05116 . 
## Size.class>43    0.4462862  0.2087287   2.138  0.03525 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1257 on 89 degrees of freedom
## Multiple R-squared:  0.8511, Adjusted R-squared:  0.8344 
## F-statistic: 50.89 on 10 and 89 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49312 -0.08631  0.00431  0.08602  0.42549 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)      3.847859   2.578619   1.492    0.139
## Distance1       -0.014751   0.051908  -0.284    0.777
## Distance4       -0.006610   0.054310  -0.122    0.903
## Distance7        0.018057   0.055015   0.328    0.743
## Distance10      -0.052325   0.055524  -0.942    0.348
## Headwidth       -0.038225   0.132639  -0.288    0.774
## I(Headwidth^2)   0.001255   0.001647   0.762    0.448
## Size.class35-39  0.142697   0.127265   1.121    0.265
## Size.class40-43  0.144954   0.160977   0.900    0.370
## Size.class>43    0.136957   0.181478   0.755    0.452
## 
## Residual standard error: 0.1467 on 106 degrees of freedom
## Multiple R-squared:  0.7252, Adjusted R-squared:  0.7019 
## F-statistic: 31.09 on 9 and 106 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47469 -0.09762  0.00590  0.11600  0.50259 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      3.457246   3.130566   1.104   0.2727  
## Distance1       -0.008092   0.061405  -0.132   0.8955  
## Distance4       -0.057925   0.062280  -0.930   0.3551  
## Distance7       -0.160488   0.063095  -2.544   0.0129 *
## Headwidth       -0.019688   0.169964  -0.116   0.9081  
## I(Headwidth^2)   0.000860   0.002116   0.407   0.6854  
## Size.class30-34  0.285857   0.192912   1.482   0.1423  
## Size.class35-39  0.495838   0.316591   1.566   0.1212  
## Size.class40-43  0.564327   0.366824   1.538   0.1278  
## Size.class>43    0.543271   0.369145   1.472   0.1450  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1714 on 81 degrees of freedom
## Multiple R-squared:  0.7762, Adjusted R-squared:  0.7514 
## F-statistic: 31.22 on 9 and 81 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42879 -0.06685  0.00995  0.07635  0.43715 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      3.1763227  1.5421690   2.060   0.0416 *
## Distance1       -0.0992721  0.0492687  -2.015   0.0462 *
## Distance4       -0.1036463  0.0499599  -2.075   0.0402 *
## Distance7       -0.0908888  0.0514312  -1.767   0.0798 .
## Distance10      -0.1093305  0.0532700  -2.052   0.0423 *
## Headwidth       -0.0022259  0.0849913  -0.026   0.9791  
## I(Headwidth^2)   0.0007212  0.0010554   0.683   0.4957  
## Size.class30-34  0.1624245  0.1305447   1.244   0.2159  
## Size.class35-39  0.3149179  0.1973099   1.596   0.1131  
## Size.class40-43  0.3350050  0.2255764   1.485   0.1402  
## Size.class>43    0.3092764  0.2340980   1.321   0.1890  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1447 on 119 degrees of freedom
## Multiple R-squared:  0.8504, Adjusted R-squared:  0.8378 
## F-statistic: 67.63 on 10 and 119 DF,  p-value: < 2.2e-16

Distance as a continuous variable

dat$Distance <- as.numeric(as.character(dat$Distance))
mod <- lm(Mass ~ . - 1, data=dat)
summary(mod)
## 
## Call:
## lm(formula = Mass ~ . - 1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.338  -7.951  -1.057   7.556  50.135 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## Colony1         -107.3491    13.6517  -7.863 1.61e-14 ***
## Colony2         -107.6902    13.5237  -7.963 7.75e-15 ***
## Colony3         -102.8121    13.5194  -7.605 1.03e-13 ***
## Colony4         -108.0597    13.4688  -8.023 4.98e-15 ***
## Colony5         -103.8467    13.4831  -7.702 5.15e-14 ***
## Colony6         -107.7151    13.3733  -8.054 3.95e-15 ***
## Distance          -0.7024     0.1629  -4.311 1.88e-05 ***
## Headwidth          5.1660     0.4582  11.275  < 2e-16 ***
## Size.class30-34   -2.3443     4.6846  -0.500    0.617    
## Size.class35-39   -7.2879     5.8753  -1.240    0.215    
## Size.class40-43   -5.4346     7.4005  -0.734    0.463    
## Size.class>43     -1.6415     8.9006  -0.184    0.854    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.89 on 636 degrees of freedom
## Multiple R-squared:  0.9808, Adjusted R-squared:  0.9805 
## F-statistic:  2711 on 12 and 636 DF,  p-value: < 2.2e-16
crPlots(mod, terms=~Headwidth+Distance)

mod <- lm(Mass ~ . + I(Headwidth^2) - 1, data=dat)
summary(mod)
## 
## Call:
## lm(formula = Mass ~ . + I(Headwidth^2) - 1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.169  -7.493  -0.708   7.546  48.513 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## Colony1         156.8651    76.4647   2.051 0.040631 *  
## Colony2         156.1822    76.3465   2.046 0.041197 *  
## Colony3         161.1700    76.3766   2.110 0.035231 *  
## Colony4         156.2623    76.4631   2.044 0.041402 *  
## Colony5         160.0268    76.3398   2.096 0.036456 *  
## Colony6         156.1846    76.3282   2.046 0.041146 *  
## Distance         -0.6770     0.1616  -4.188 3.21e-05 ***
## Headwidth        -9.3103     4.1483  -2.244 0.025153 *  
## Size.class30-34  11.9365     6.1731   1.934 0.053604 .  
## Size.class35-39  17.2013     9.0869   1.893 0.058815 .  
## Size.class40-43  19.9846    10.3069   1.939 0.052949 .  
## Size.class>43    19.2501    10.6417   1.809 0.070933 .  
## I(Headwidth^2)    0.1805     0.0514   3.511 0.000478 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.77 on 635 degrees of freedom
## Multiple R-squared:  0.9812, Adjusted R-squared:  0.9808 
## F-statistic:  2548 on 13 and 635 DF,  p-value: < 2.2e-16
crPlots(mod, terms=~Headwidth+I(Headwidth^2)+Distance, layout=c(2,2))

mod <- lm(Mass ~ . + I(Headwidth^2) + I(Distance^2) - 1, data=dat)
summary(mod)
## 
## Call:
## lm(formula = Mass ~ . + I(Headwidth^2) + I(Distance^2) - 1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.248  -7.933  -0.541   7.646  50.063 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## Colony1         153.77536   76.18024   2.019 0.043952 *  
## Colony2         152.81856   76.06449   2.009 0.044954 *  
## Colony3         157.78987   76.09452   2.074 0.038519 *  
## Colony4         152.92464   76.18039   2.007 0.045131 *  
## Colony5         157.20031   76.05410   2.067 0.039144 *  
## Colony6         153.00640   76.04486   2.012 0.044637 *  
## Distance         -2.01429    0.57232  -3.519 0.000463 ***
## Headwidth        -9.00295    4.13423  -2.178 0.029799 *  
## Size.class30-34  11.80885    6.14955   1.920 0.055271 .  
## Size.class35-39  16.87972    9.05281   1.865 0.062702 .  
## Size.class40-43  19.78927   10.26743   1.927 0.054378 .  
## Size.class>43    19.41969   10.60082   1.832 0.067435 .  
## I(Headwidth^2)    0.17598    0.05124   3.435 0.000632 ***
## I(Distance^2)     0.13516    0.05551   2.435 0.015169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.72 on 634 degrees of freedom
## Multiple R-squared:  0.9814, Adjusted R-squared:  0.981 
## F-statistic:  2385 on 14 and 634 DF,  p-value: < 2.2e-16
crPlots(mod, terms=~Headwidth+I(Headwidth^2)+Distance+I(Distance^2), layout=c(2,2))

mods.col <- list()
for (i in 1:length(unique(dat$Colony))) {
  mod.curr <- lm(log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + Size.class, data=dat, subset=(Colony == i))
  mods.col <- c(mods.col, list(mod.curr))
  print(summary(mod.curr))
}
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31195 -0.08483 -0.00879  0.06696  0.38096 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     -1.552881   3.308195  -0.469   0.6398  
## Distance        -0.008266   0.003862  -2.140   0.0347 *
## Headwidth        0.251149   0.175496   1.431   0.1554  
## I(Headwidth^2)  -0.002364   0.002085  -1.134   0.2594  
## Size.class30-34 -0.063692   0.240349  -0.265   0.7915  
## Size.class35-39 -0.140551   0.389193  -0.361   0.7187  
## Size.class40-43 -0.172844   0.434286  -0.398   0.6915  
## Size.class>43   -0.171452   0.433246  -0.396   0.6931  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1341 on 103 degrees of freedom
## Multiple R-squared:  0.7766, Adjusted R-squared:  0.7614 
## F-statistic: 51.15 on 7 and 103 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62049 -0.07120  0.01291  0.09140  0.31079 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      4.421071   2.145981   2.060   0.0422 *
## Distance        -0.002676   0.004307  -0.621   0.5358  
## Headwidth       -0.062732   0.111198  -0.564   0.5740  
## I(Headwidth^2)   0.001500   0.001388   1.081   0.2826  
## Size.class35-39  0.129235   0.120054   1.076   0.2845  
## Size.class40-43  0.157579   0.160627   0.981   0.3291  
## Size.class>43    0.151702   0.185315   0.819   0.4151  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1515 on 93 degrees of freedom
## Multiple R-squared:  0.7759, Adjusted R-squared:  0.7615 
## F-statistic: 53.68 on 6 and 93 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32610 -0.07117 -0.00882  0.08325  0.33967 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      1.3780533  1.6161729   0.853   0.3961  
## Distance        -0.0045702  0.0036937  -1.237   0.2191  
## Headwidth        0.0973110  0.0867151   1.122   0.2647  
## I(Headwidth^2)  -0.0006388  0.0010825  -0.590   0.5566  
## Size.class30-34  0.2795899  0.1193517   2.343   0.0213 *
## Size.class35-39  0.2438500  0.1728702   1.411   0.1617  
## Size.class40-43  0.3249633  0.1978444   1.643   0.1039  
## Size.class>43    0.3926782  0.2107608   1.863   0.0656 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1301 on 92 degrees of freedom
## Multiple R-squared:  0.8351, Adjusted R-squared:  0.8226 
## F-statistic: 66.57 on 7 and 92 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46009 -0.08483  0.00447  0.08672  0.40491 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)      3.4220579  2.5502471   1.342    0.182
## Distance        -0.0027550  0.0041094  -0.670    0.504
## Headwidth       -0.0130204  0.1308941  -0.099    0.921
## I(Headwidth^2)   0.0008912  0.0016204   0.550    0.583
## Size.class35-39  0.1299514  0.1260114   1.031    0.305
## Size.class40-43  0.1535099  0.1598058   0.961    0.339
## Size.class>43    0.1667161  0.1795379   0.929    0.355
## 
## Residual standard error: 0.1462 on 109 degrees of freedom
## Multiple R-squared:  0.7193, Adjusted R-squared:  0.7039 
## F-statistic: 46.55 on 6 and 109 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45141 -0.10306  0.00669  0.11302  0.51690 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      3.6075698  3.0739941   1.174  0.24392   
## Distance        -0.0235189  0.0069786  -3.370  0.00114 **
## Headwidth       -0.0250366  0.1668124  -0.150  0.88106   
## I(Headwidth^2)   0.0009065  0.0020741   0.437  0.66320   
## Size.class30-34  0.2869502  0.1903189   1.508  0.13542   
## Size.class35-39  0.4975452  0.3136175   1.586  0.11644   
## Size.class40-43  0.5711825  0.3633973   1.572  0.11980   
## Size.class>43    0.5533816  0.3654573   1.514  0.13377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1699 on 83 degrees of freedom
## Multiple R-squared:  0.7748, Adjusted R-squared:  0.7558 
## F-statistic:  40.8 on 7 and 83 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = log(Mass) ~ Distance + Headwidth + I(Headwidth^2) + 
##     Size.class, data = dat, subset = (Colony == i))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43048 -0.07175  0.00922  0.07637  0.42193 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      3.3008183  1.5433877   2.139   0.0345 *
## Distance        -0.0040700  0.0039060  -1.042   0.2995  
## Headwidth       -0.0140875  0.0849431  -0.166   0.8686  
## I(Headwidth^2)   0.0008959  0.0010540   0.850   0.3970  
## Size.class30-34  0.1591814  0.1310688   1.214   0.2269  
## Size.class35-39  0.3116475  0.1979461   1.574   0.1180  
## Size.class40-43  0.3363065  0.2262235   1.487   0.1397  
## Size.class>43    0.2891843  0.2347295   1.232   0.2203  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1453 on 122 degrees of freedom
## Multiple R-squared:  0.8452, Adjusted R-squared:  0.8363 
## F-statistic: 95.13 on 7 and 122 DF,  p-value: < 2.2e-16
for (i in 1:length(unique(dat$Colony))) {
  print(coef(mods.col[[i]])[2])
}
##     Distance 
## -0.008266139 
##     Distance 
## -0.002676431 
##     Distance 
## -0.004570243 
##     Distance 
## -0.002754967 
##    Distance 
## -0.02351894 
##     Distance 
## -0.004070044